Rare Variant eQTLs

I'm going to examine the eQTL results to determine whether any of the eQTLs are driven by rare variants that are at a higher frequency in our subject set.


In [1]:
import glob
import os
import random
import subprocess

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import seaborn as sns
import statsmodels.stats.multitest as smm
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline
%load_ext rpy2.ipython

dy_name = 'rare_variant_eqtls'

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

In [2]:
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls01', 'lead_variants.tsv')
lead_vars = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls02', 'lead_variants.tsv')
lead_vars2 = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls03', 'lead_variants.tsv')
lead_vars3 = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'unrelated_eqtls01', 'lead_variants.tsv')
unr_lead_vars = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
rna_meta_eqtl = rna_meta[rna_meta.in_eqtl]

fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)

subject_meta_eqtl = subject_meta.ix[rna_meta_eqtl.subject_id]
rna_meta_eqtl = rna_meta_eqtl.merge(subject_meta_eqtl, left_on='subject_id', right_index=True)

In [3]:
lead_vars = lead_vars[lead_vars.perm_sig]
lead_vars = lead_vars[lead_vars.variant_type == 'snv']
lead_vars2 = lead_vars2[lead_vars2.perm_sig]
lead_vars2 = lead_vars2[lead_vars2.variant_type == 'snv']
lead_vars3 = lead_vars3[lead_vars3.perm_sig]
lead_vars3 = lead_vars3[lead_vars3.variant_type == 'snv']
unr_lead_vars = unr_lead_vars[unr_lead_vars.perm_sig]
unr_lead_vars = unr_lead_vars[unr_lead_vars.variant_type == 'snv']

In [4]:
af_cols = [x for x in lead_vars.columns if '_AF' in x]
lead_vars_af = lead_vars.dropna(subset=af_cols)
lead_vars2_af = lead_vars2.dropna(subset=af_cols)
lead_vars3_af = lead_vars3.dropna(subset=af_cols)

In [5]:
lead_vars[af_cols] = lead_vars[af_cols].fillna(0)

In [6]:
lead_vars_af.AF.hist()
plt.ylabel('Number of variants')
plt.xlabel('Alternate allele frequency');


The allele frequencies from 1,000 Genomes are for the alternate allele, so I need to take care when I define which allele is minor.


In [7]:
a = sum((lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols))
b = sum((lead_vars_af[af_cols] > 1 - 0.005).sum(axis=1) == len(af_cols))
print('There are {} variants where the alternate is rare and {} '
      'where the reference is rare.'.format(a, b))


There are 143 variants where the alternate is rare and 0 where the reference is rare.

It doesn't seem like any of my eQTL lead variants that are in 1000 Genomes have rare reference alleles, so I can say that rare variants have the alternate allele as the minor allele.


In [8]:
lead_vars_af['rare'] = False
lead_vars_af.ix[lead_vars_af[(lead_vars_af[af_cols] < 0.005).sum(axis=1) == len(af_cols)].index, 'rare'] = True
a = lead_vars_af.rare.sum()
b = len(set(lead_vars_af.ix[lead_vars_af.rare, 'gene_id']))
print('There are {} rare lead variants for {} eGenes.'.format(a, b))


There are 143 rare lead variants for 106 eGenes.

In [9]:
se = lead_vars_af[lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')

se = lead_vars_af[lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('All samples')
plt.legend();



In [10]:
af_cols = [x for x in unr_lead_vars.columns if '_AF' in x]
unr_lead_vars_af = unr_lead_vars.dropna(subset=af_cols)
unr_lead_vars_af['rare'] = False
unr_lead_vars_af.ix[unr_lead_vars_af[(unr_lead_vars_af[af_cols] < 0.005).sum(axis=1) == 
                                     len(af_cols)].index, 'rare'] = True

In [11]:
se = unr_lead_vars_af[unr_lead_vars_af.rare].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='rare', alpha=0.5, weights=weights, histtype='stepfilled')

se = unr_lead_vars_af[unr_lead_vars_af.rare == False].beta
weights = np.ones_like(se) / float(se.shape[0])
se.hist(bins=np.arange(-3, 3.1, 0.1), label='not rare', alpha=0.5, weights=weights, histtype='stepfilled')
plt.xlim(-3, 3)
plt.xlabel('Beta')
plt.ylabel('Fraction of lead variants')
plt.title('Unrelateds eQTL')
plt.legend();



In [12]:
def get_genotypes(df):
    genotypes = pd.DataFrame(np.nan, index=df.index, columns=rna_meta_eqtl.wgs_id)

    fn = '/projects/CARDIPS/analysis/cardips-ipsc-eqtl/private_output/eqtl_input/filtered_all/0000.vcf.gz'
    vcf_reader = pyvcf.VCFReader(open(fn))

    count = 0
    for i in df.index:
        if count % 1000 == 0:
            print(count)
        res = vcf_reader.fetch(df.ix[i, 'chrom'][3:], df.ix[i, 'end'],
                               df.ix[i, 'end'])
        r = res.next()

        hom_refs = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_refs()])
        hets = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hets()])
        hom_alts = set(rna_meta_eqtl.wgs_id) & set([x.sample for x in r.get_hom_alts()])

        genotypes.ix[i, hom_refs] = 0
        genotypes.ix[i, hets] = 1
        genotypes.ix[i, hom_alts] = 2
        count += 1
    return genotypes

In [13]:
out = os.path.join(private_outdir, 'genotypes.tsv')
if not os.path.exists(out):
    genotypes = get_genotypes(lead_vars_af)
    genotypes.to_csv(out, sep='\t')
else:
    genotypes = pd.read_table(out, index_col=0)
    
out = os.path.join(private_outdir, 'genotypes2.tsv')
if not os.path.exists(out):
    genotypes2 = get_genotypes(lead_vars2_af)
    genotypes2.to_csv(out, sep='\t')
else:
    genotypes2 = pd.read_table(out, index_col=0)
    
out = os.path.join(private_outdir, 'genotypes3.tsv')
if not os.path.exists(out):
    genotypes3 = get_genotypes(lead_vars3_af)
    genotypes3.to_csv(out, sep='\t')
else:
    genotypes3 = pd.read_table(out, index_col=0)

In [14]:
genotypes_f = genotypes.ix[lead_vars_af.index, rna_meta_eqtl.wgs_id]

In [15]:
lead_vars_af['ref_is_major'] = lead_vars_af.ac < lead_vars_af.ns

In [16]:
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');



In [17]:
t = lead_vars_af[lead_vars_af.rare]
a = t[t.AF == 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50)
a = t[t.AF != 0]
plt.scatter(a.maf, a.AF, alpha=0.25, s=50, color='red')
plt.ylim(-0.0001, 0.002)
plt.ylabel('1000 Genomes MAF')
plt.xlabel('CARDiPS MAF');
plt.xlim(0, 0.05)


Out[17]:
(0, 0.05)

In [18]:
lead_vars_uniq = lead_vars.drop_duplicates(subset='location')
lead_vars_uniq['minor_AF'] = lead_vars_uniq.AF
lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF'] = \
    1 - lead_vars_uniq.ix[lead_vars_uniq.minor_AF > 0.5, 'minor_AF']
lead_vars_uniq['ref_is_major'] = lead_vars_uniq.ac < lead_vars_uniq.ns

In [24]:
a = (lead_vars_uniq.AF == 0).value_counts()[False]
b = len(set(lead_vars_uniq.gene_id))
print('Obtained allele frequency for {:,} lead SNVs from {:,} eGenes from 1000 Genomes.\n'.format(a, b))

a = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].gene_id))
nr = len(set(lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].location))
print('{:,} lead SNVs from {:,} eGenes were rare (MAF < 0.5%) in 1000 Genomes.'.format(nr, a))
n = (lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005].maf > 0.05).value_counts()[True]
print('{:,} of these {:,} lead SNVs ({:.1f}%) are not rare among the 215 donors.'.format(n, nr, 100 * n / float(nr)))

a = (lead_vars_uniq.AF == 0).value_counts()[True]
print('{:,} of these {:,} lead SNVs were not seen in 1000 Genomes.'.format(a, nr))

a = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) & 
                           (lead_vars_uniq.minor_AF > 0)].gene_id))
b = len(set(lead_vars_uniq[(lead_vars_uniq.minor_AF < 0.005) &
                           (lead_vars_uniq.minor_AF > 0)].location))
print('{:,} of these {:,} lead SNVs from {:,} eGenes were observed in 1000 Genomes but were '
      'rare (MAF < 0.5%).\n'.format(b, nr, a))


Obtained allele frequency for 12,706 lead SNVs from 4,783 eGenes from 1000 Genomes.

305 lead SNVs from 185 eGenes were rare (MAF < 0.5%) in 1000 Genomes.
76 of these 305 lead SNVs (24.9%) are not rare among the 215 donors.
93 of these 305 lead SNVs were not seen in 1000 Genomes.
212 of these 305 lead SNVs from 118 eGenes were observed in 1000 Genomes but were rare (MAF < 0.5%).


In [25]:
jg = sns.jointplot(lead_vars_uniq.maf, lead_vars_uniq.minor_AF, alpha=0.2, stat_func=None);



In [48]:
lead_vars_uniq.to_csv(os.path.join(outdir, 'lead_vars_uniq.tsv'), sep='\t')

In [29]:
wgs_meta = subject_meta.copy(deep=True)
wgs_meta = wgs_meta.ix[rna_meta[rna_meta.in_eqtl].subject_id]
se = pd.Series(rna_meta[rna_meta.in_eqtl].wgs_id.values, index=rna_meta[rna_meta.in_eqtl].subject_id)
wgs_meta.index = se[list(wgs_meta.index)].values

In [30]:
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_uniq.ac.min()]

In [31]:
for i in family_vc.index:
    se = genotypes.ix[lead_vars_uniq.index, 
                      rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
    c = '{}_maf'.format(i)
    lead_vars_uniq[c] = np.nan
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values] / (family_vc[i] * 2)
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values]) / (family_vc[i] * 2)
    c = '{}_mac'.format(i)
    lead_vars_uniq.ix[lead_vars_uniq.ref_is_major, c] = se[lead_vars_uniq.ref_is_major.values]
    lead_vars_uniq.ix[(lead_vars_uniq.ref_is_major == False).values, c] = \
        (family_vc[i] * 2 - se[(lead_vars_uniq.ref_is_major == False).values])

In [38]:
t = lead_vars_uniq[lead_vars_uniq.minor_AF < 0.005]
num_fams = []
for i in t.index:
    se = genotypes.ix[i]
    mode = se.mode().values[0]
    num_fams.append(len(set(wgs_meta.ix[se[se != mode].index, 'family_id'])))
num_fams = pd.Series(num_fams)
a = sum(num_fams >= 2)
print('{:,} of {:,} rare lead variants were observed in more than one family.'.format(a, t.shape[0]))


272 of 305 rare lead variants were observed in more than one family.

In [47]:
ax = num_fams.value_counts().sort_index().plot.bar()
ax.set_ylabel('Number of lead SNVs')
ax.set_xlabel('Number of families observed in');



In [49]:
num_fams.to_csv(os.path.join(outdir, 'rare_num_fams.tsv'), sep='\t')

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
family_vc = subject_meta_eqtl.family_id.value_counts()
family_vc = family_vc[family_vc * 2 >= lead_vars_af.ac.min()]

In [21]:
for i in family_vc.index:
    se = genotypes_f[rna_meta_eqtl[rna_meta_eqtl.family_id == i].wgs_id].sum(axis=1)
    c = '{}_maf'.format(i)
    lead_vars_af[c] = np.nan
    lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major] / (family_vc[i] * 2)
    lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False]) / (family_vc[i] * 2)
    c = '{}_mac'.format(i)
    lead_vars_af.ix[lead_vars_af.ref_is_major, c] = se[lead_vars_af.ref_is_major]
    lead_vars_af.ix[lead_vars_af.ref_is_major == False, c] = \
        (family_vc[i] * 2 - se[lead_vars_af.ref_is_major == False])

In [22]:
c = '{}_maf'.format(family_vc.index[1])
t = lead_vars_af[lead_vars_af[c] > lead_vars_af.maf]
t = t[t.rare]
t = t[t.maf < 0.05]

In [23]:
plt.scatter(t.maf, t[c])


Out[23]:
<matplotlib.collections.PathCollection at 0x7f14f6fd9cd0>

In [24]:
for i in family_vc.index:
    c = '{}_mac'.format(i)
    a = set(subject_meta.ix[subject_meta.family_id == i, 'ethnicity_group'])
    print(i, len(set(lead_vars_af.ix[lead_vars_af.ac == lead_vars_af[c], 'gene_id'])), list(a))


('abb401f1-5c3e-48ed-8c55-839ce2afe7e6', 1, ['Hispanic', 'African American', 'Multiple ethnicities reported', 'European'])
('84fda65d-9a06-4bbe-ad75-a24773724c32', 2, ['Asian', 'Multiple ethnicities reported', 'European'])
('a56f620d-82e4-4963-a80b-b97c091c4b06', 0, ['Multiple ethnicities reported', 'European'])
('319aca18-4ca4-444d-80ca-3c0065169793', 0, ['Asian'])
('0a78a1f4-540f-4f63-acd1-197228abad11', 0, ['European'])
('cec0061a-f86c-4d8f-979f-3c6edce2d905', 0, ['European'])
('24bb1db3-75fb-4c66-9ccd-8e98dc3fadf1', 0, ['European'])
('f9d0c0e4-4408-469a-8c14-e65352142cc9', 0, ['Asian'])
('225e50a6-3386-4df5-b47d-6a564cde04d3', 0, ['Asian'])
('a23bb99f-656b-4853-a029-2400ec16dae6', 0, ['European'])
('2c54a579-6ba4-45be-b3cc-e2003349acd7', 0, ['European'])
('0fef459e-1c58-4bf8-9b2e-48da79a78536', 0, ['European'])
('bdc08658-e9d6-45ce-8f51-57775682c800', 0, ['European'])
('25e71a57-1993-4844-a0e1-f273522a8d05', 0, ['European'])
('bdc7cf80-9790-410e-abf0-6085d2881790', 0, ['European'])
('a11862e5-d800-4f1b-acc0-41d7368c2b32', 0, ['European'])
('d4bfdb4d-593c-4216-8a85-85bc98ec8f50', 0, ['Asian'])
('6c5edc7b-4dde-4802-80e7-995b62133292', 0, ['Asian', 'Multiple ethnicities reported', 'European'])
('530532cb-78cc-41e8-a2bc-a30e9108221b', 0, ['Multiple ethnicities reported', 'European'])
('80d4fbb6-6aaf-4414-acf9-e75d02ba15a6', 0, ['European'])
('f5352d65-a24e-4d0c-84a8-0539e396a5e4', 0, ['Multiple ethnicities reported', 'European'])

In [21]:
subject_meta[subject_meta.family_id == '84fda65d-9a06-4bbe-ad75-a24773724c32'].head()


Out[21]:
sex age family_id father_id mother_id twin_id ethnicity_group
id
f549b5fa-a6c0-49fb-8a07-dda4f72ff076 M 51.9 84fda65d-9a06-4bbe-ad75-a24773724c32 NaN NaN NaN European
2a81d225-42f6-43cf-904a-be64bf3fd2d7 F 45.2 84fda65d-9a06-4bbe-ad75-a24773724c32 6a784dd7-9c48-4841-93f9-930b9de49cc4 ae3637d4-493e-482e-bc48-6915e30ffb9c NaN Multiple ethnicities reported
e932e556-59a6-4f70-9b4c-ef5f69dac3ce F 48.3 84fda65d-9a06-4bbe-ad75-a24773724c32 6a784dd7-9c48-4841-93f9-930b9de49cc4 ae3637d4-493e-482e-bc48-6915e30ffb9c NaN Multiple ethnicities reported
de9c74e8-d399-465e-92cd-904a0e323bca M 50.2 84fda65d-9a06-4bbe-ad75-a24773724c32 6a784dd7-9c48-4841-93f9-930b9de49cc4 ae3637d4-493e-482e-bc48-6915e30ffb9c NaN European
f772212a-cd98-40f9-bdcd-e75740c1f6be F 18.1 84fda65d-9a06-4bbe-ad75-a24773724c32 f549b5fa-a6c0-49fb-8a07-dda4f72ff076 e932e556-59a6-4f70-9b4c-ef5f69dac3ce NaN European

In [21]:
lead_vars_af.to_csv(os.path.join(outdir, 'lead_vars_af.tsv'), sep='\t')
unr_lead_vars_af.to_csv(os.path.join(outdir, 'unr_lead_vars_af.tsv'), sep='\t')

In [28]:
tdf = lead_vars_af[lead_vars_af.rare]

In [31]:
plt.scatter(tdf.AF, tdf.beta)


Out[31]:
<matplotlib.collections.PathCollection at 0x7f8040262ad0>

In [46]:
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['84fda65d-9a06-4bbe-ad75-a24773724c32_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)


Out[46]:
chr14:23615514-23615515:ENSG00000100897.13 chr11:66914128-66914129:ENSG00000173992.4
chrom chr14 chr11
start 23615514 66914128
end 23615515 66914129
marker_id 14:23615515_G/A_rs117617634 11:66914129_A/G
ns 215 215
ac 9 8
callrate 1 1
genocnt 206/9/0 207/8/0
maf 0.02093 0.0186
stat -4.8404 -4.4335
pvalue 2.494e-06 1.488e-05
beta -2.031 -1.843
sebeta 0.4197 0.4157
r2 0.09952 0.08485
gene_id ENSG00000100897.13 ENSG00000173992.4
ref G A
alt A G
variant_type snv snv
variant_caller gatk gatk
location chr14:23615514-23615515 chr11:66914128-66914129
rsid rs117617634 NaN
length NaN NaN
tss_dist -967889 546223
tss_dist_abs 967889 546223
gene_name DCAF11 CCS
gene_type protein_coding protein_coding

In [47]:
t = lead_vars_af[lead_vars_af.ac == lead_vars_af['abb401f1-5c3e-48ed-8c55-839ce2afe7e6_mac']]
t.sort_values(by='pvalue', inplace=True)
t.drop_duplicates(subset=['gene_id']).T.head(26)


Out[47]:
chr7:111481507-111481508:ENSG00000006652.9
chrom chr7
start 111481507
end 111481508
marker_id 7:111481508_C/T
ns 215
ac 6
callrate 1
genocnt 209/6/0
maf 0.01395
stat 4.374
pvalue 1.913e-05
beta 1.887
sebeta 0.4313
r2 0.08277
gene_id ENSG00000006652.9
ref C
alt T
variant_type snv
variant_caller gatk
location chr7:111481507-111481508
rsid NaN
length NaN
tss_dist -581515
tss_dist_abs 581515
gene_name IFRD1
gene_type protein_coding

TODO: Are any of the lead variants for the second or third eQTLs rare?


In [50]:
sum(lead_vars2.AF < 0.005)


Out[50]:
49

In [51]:
sum(lead_vars3.AF < 0.005)


Out[51]:
46

In [ ]: